---
title: "S1 Appendix: Does Suffering Suffice? An Experimental Assessment of Desert Retributivism"
author: ""
date: ""
output: 
  bookdown::pdf_document2:
    toc: no
    keep_tex: true
    number_sections: FALSE
geometry: [top=0.85in,footskip=0.75in]
fontsize: [10pt,letterpaper]
documentclass: article
header-includes:
   - \usepackage{color}
   - \usepackage{caption}
   - \usepackage{float}
   - \usepackage{dcolumn}
   - \usepackage{tabu}
csl: plos.csl
bibliography: references_appendix.bib
---

\newcommand*{\secref}[1]{Section~\ref{#1}}

\setcounter{table}{0}
\renewcommand{\thetable}{\Alph{table}}
\renewcommand{\figurename}{Table}

\setcounter{figure}{0}
\renewcommand\thefigure{\Alph{figure}}
\renewcommand{\figurename}{Fig}


```{r setup, include = FALSE}
  library(knitr)
  knitr::opts_chunk$set(
	fig.pos = "H",
	cache = FALSE
)

  set.seed(12345)
  
  rm(list=ls())
```





```{r data-import-cleaning, message=FALSE, warning=FALSE, include=FALSE}
# Packages
  library(haven)
  library(plotly)
  library(dplyr)
  library(stargazer)
  library(readr)
  library(stringr)
  library(xtable)
  library(kableExtra)
  library(knitr)
  library(tidyr)
  library(irr)
  library("ggpubr")
  library(gridExtra)
  library(grid)
  library(purrr)

g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)}
```

```{r include=FALSE}
 data <- data.frame(read_csv("data.csv"))
 data.turk <- data.frame(read_csv("data-turk.csv"))
 mturk.stats <- data.frame(read_csv("mturk-stats.csv")) 
```

\clearpage

\vspace{4cm}

\tableofcontents

\clearpage

# Summary statistics
Table \@ref(tableA) displays summary statistics for the numeric variables. For the categorical variables we refer to the graphs in the manuscript.

```{r tableA, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE, results="asis"}
data$sex <- as.numeric(dplyr::recode(data$sex, 
                            `Female` = "0", 
                            `Male` = "1"))

    stargazer(data %>% select(age, sex) %>% data.frame(), 
            type="latex", font.size="footnotesize", table.placement="!ht", 
            column.sep.width = ".2pt" , title = "Summary stats for age and sex", 
            digits = 2, rownames = FALSE,
            header=FALSE,
            label = "tableA")
```



# Balance statistics
Table \@ref(tab:tableB) provides balance statistics for sex and age.


```{r tableB, echo=FALSE, message=FALSE, warning=FALSE, results="asis"}

    by.group <- group_by(data, treatment_text) %>%
          summarise(mean.sex = mean(sex, 
                                       na.rm = TRUE),
                    mean.age = mean(age, 
                                       na.rm = TRUE),
                    n.total = n(), 
                    n.mean.justice_perception = sum(!is.na(sex))
                    )

by.group.display <- data.frame(by.group[-7,])
by.group.display <- by.group.display[, -5]



    by.group.display$treatment_text <- str_replace_all(by.group.display$treatment_text, "^happy_", "Happy ")
    by.group.display$treatment_text <- str_replace_all(by.group.display$treatment_text, "^unhappy_", "Unhappy ")
    by.group.display$treatment_text <- str_replace_all(by.group.display$treatment_text, "neutral_", "Neutral ")
    by.group.display$treatment_text <- str_replace_all(by.group.display$treatment_text, "nomoralch", "(No Moral Change)")
    by.group.display$treatment_text <- str_replace_all(by.group.display$treatment_text, "yesmoralch", "(Yes Moral Change)")    


row.names(by.group.display) <- by.group.display$treatment_text





by.group.display <- round(by.group.display[,-1],2)
names(by.group.display) <- c("Sex (Mean)", "Age (Mean)", "N (total)")
knitr::kable(by.group.display, row.names = TRUE, 
             caption = 'Balance Statistics: Sex, Age', format = "latex", booktabs = T) %>%
  kable_styling(font_size = 9) 

```


# Perceived justice: Distribution

```{r figA, message=FALSE, warning=FALSE, include=FALSE}

data %>% group_by(justice_perception) %>% 
    count(justice_perception) %>% 
  ungroup() %>%
    mutate(prop = round(n/sum(n),3)) %>% 
   ggplot(aes(x=justice_perception, y = n)) + 
    geom_col(fill = "gray") + 
      geom_text(aes(x=justice_perception, y = n-5, label = paste(prop*100, "%",sep="")), 
            #position = position_stack(vjust = 0.9), 
            size = 4) + 
  labs(x="Justice perception", y = "N")+ 
         theme_light() +
    scale_x_continuous(breaks=seq(0,10,1), labels = seq(0,10,1)) +
theme(axis.text.x=element_text(angle=35,hjust=1,vjust=1, margin=margin(0.2,0,0.3,0,"cm")), 
      plot.title = element_text(hjust = 0.5),
      plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"))


walk(c('pdf'), ~ ggsave(filename = file.path(paste0("figA.", .x)), device = .x))

```







\begin{figure}[H]
\centering
\caption{Perceived justice: Distribution}\label{figA}
		\includegraphics[width=0.7\linewidth]{figA.pdf}
\begin{flushleft}
\end{flushleft}
\end{figure}



# Crowd-coding of open-ended responses {#sec:crowdcoding}

In total ```r length(unique(data.turk$WorkerId))``` mechanical turk workers participated in our crowd-sourcing task to classify responses to our open-ended question on aims of punishment. Table \@ref(tab:tableC) provides some statistics on the crowdsourcing task. We had ```r length(unique(data.turk$Input.participant_id))``` responses. The idea was to classify each response by 4 raters which would result in a total number of ```r length(unique(data.turk$Input.participant_id))*4``` assignments. In the end our data comprised ```r mturk.stats$value[mturk.stats$statistic=="number.of.assignments"]``` analyzable assignments. We crowd-sourced the data in 5 batches in order to be able to assess the rating quality and other statistics along the way. As suggested by @Williamson2016-az [p.79] we tried to pay workers above the minimum wage of 7.25$. On average our workers recieved a wage of `r round(mturk.stats$value[mturk.stats$statistic=="wage.per.hour"], 2)` $ per hour. Depending on their speed their wage may vary. Mechanical turk workers that were accepted for our task needed to be located in the U.S., have a HIT Approval Rate (%) for all Requesters' HITs greater than 97%, have a number of HITs Approved greater than 1000 and needed to have 'Masters' granted. Masters are elite groups of Workers who have demonstrated accuracy on specific types of HITs on the Mechanical Turk marketplace. We added Masters requirement after Batch 1 and noticed a considerable increase in response quality.        
The crowd-sourcing task is depicted in Figure \@ref(fig-mechanicalturk-aims). We provided raters with a set of possible aims of punishment and asked them to classify the responses regarding whether certain aims were mentioned or implied by a respondent's answer. For this task we did not randomize the ranking of the categories since we wanted raters to get used to the classification interface.

```{r tableC, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE, results="asis"}
  x <- mturk.stats %>% slice(c(4,5,6,7,8,9))
x$statistic <- gsub("\\.", " ", paste(gsub("_", " (", x$statistic), ")", sep=""))
x$statistic <- paste0(toupper(substr(x$statistic, 1, 1)), substr(x$statistic, 2, nchar(x$statistic)))
names(x) <- c("Statistic", "Value")
  knitr::kable(x, 
               caption = 'Crowdsourcing stats', format = "latex", booktabs = T) %>%
  kable_styling(font_size = 9) 
```




```{r data-krippendorfs-alpha, echo=FALSE}
# Generate coding variables
  data.turk$rate_suffering <- 0
  data.turk$rate_suffering[str_detect(data.turk$classification, "suffering")] <- 1
  data.turk$rate_deterrence <- 0
  data.turk$rate_deterrence[str_detect(data.turk$classification, "deterrence")] <- 1
  data.turk$rate_reintegration <- 0
  data.turk$rate_reintegration[str_detect(data.turk$classification, "reintegration")] <- 1 
  data.turk$rate_rehabilitation <- 0
  data.turk$rate_rehabilitation[str_detect(data.turk$classification, "rehabilitation")] <- 1 
  data.turk$rate_amends <- 0
  data.turk$rate_amends[str_detect(data.turk$classification, "amends")] <- 1 
  data.turk$rate_vengeance <- 0
  data.turk$rate_vengeance[str_detect(data.turk$classification, "vengeance")] <- 1 
  data.turk$rate_awareness <- 0
  data.turk$rate_awareness[str_detect(data.turk$classification, "awareness")] <- 1 
  data.turk <- as.data.frame(data.turk)

  

  
# GENERATE MATRIX TO CALCULATE KRIPP'S ALPHA
  # ROWS = WORKERS
  # COLUMS = Responses to classify (= RespondentIDs)
  data.turk.irr <- data.turk %>% ungroup() %>% select(WorkerId, Input.participant_id, Input.variable, classification)
  df.null <- data.turk.irr %>% complete(WorkerId, Input.participant_id) %>% arrange(WorkerId) %>%
              select(-Input.variable) 

    df.null <- df.null %>% spread(key = Input.participant_id, value = classification)
  
  
# Generate matrices for Kripppendorf's alpha for all categories
  for(y in c("rate_suffering", "rate_deterrence", "rate_reintegration", 
             "rate_rehabilitation", "rate_amends", "rate_vengeance", "rate_awareness")){
    
      y <- gsub("rate_", "", y)
      df <- df.null
      
      
  for(i in 1:nrow(df)){
    for(z in 2:ncol(df)){
      
      if(!is.na(df[i, z])){
      if(str_detect(df[i, z], y)==TRUE){df[i, z] <- 1}else{df[i, z] <- 0}
      }}
  }
  df <- df %>% select(-WorkerId)
  df <- sapply(df, as.numeric)
  df <- as.matrix(df)
  assign(paste("df_rate_", y, sep=""), df)  
  }
```
  

  
```{r tableD, echo=FALSE, results="asis"}
  
# Calculate kripp.alpha
  alpha.suffering <- irr::kripp.alpha(df_rate_suffering)
  alpha.deterrence <- irr::kripp.alpha(df_rate_deterrence)
  alpha.reintegration <- irr::kripp.alpha(df_rate_reintegration)
  alpha.rehabilitation <- irr::kripp.alpha(df_rate_rehabilitation)
  alpha.amends <- irr::kripp.alpha(df_rate_amends)
  alpha.vengeance <- irr::kripp.alpha(df_rate_vengeance)
  alpha.awareness <- irr::kripp.alpha(df_rate_awareness)
  
  irr.table <- data.frame(Category = c("Suffering", "Deterrence", 
                                       "Reintegration", "Rehabilitation", 
                                       "Amends", "Vengeance", "Awareness"),
                          Alpha = c(
                             alpha.suffering$value,
                              alpha.deterrence$value,
                              alpha.reintegration$value,
                              alpha.rehabilitation$value,
                              alpha.amends$value,
                              alpha.vengeance$value,
                              alpha.awareness$value),
                          Responses = c(
                             alpha.suffering$subjects,
                              alpha.deterrence$subjects,
                              alpha.reintegration$subjects,
                              alpha.rehabilitation$subjects,
                              alpha.amends$subjects,
                              alpha.vengeance$subjects,
                              alpha.awareness$subjects),  
                            Raters = c(
                             alpha.suffering$raters,
                              alpha.deterrence$raters,
                              alpha.reintegration$raters,
                              alpha.rehabilitation$raters,
                              alpha.amends$raters,
                              alpha.vengeance$raters,
                              alpha.awareness$raters)) 
  irr.table$Alpha <- round(irr.table$Alpha, 2)
  

  knitr::kable(irr.table, 
               caption = "Interrater-reliability: Krippendorf's alpha", format = "latex", booktabs = T) %>%
  kable_styling(font_size = 9)                         

```  

Since not all raters coded all responses we use Krippendorf's alpha as a measure of interrater reliability [@Krippendorff2013-lj]. We calculated alpha for each of the 7 categories into which raters could categorize a response. The results are depicted in Table \@ref(tab:tableD). Krippendorf's Alpha ranges from `r round(min(irr.table$Alpha), 2)` to `r round(max(irr.table$Alpha), 2)`, i.e., we get categories for which it is relatively satisfying, e.g., rehabilitation, and categories for which is less satisfying, e.g., vengeance.       

\begin{figure}[H]
\centering
\caption{Crowd-coding of responses: Aims of punishment}\label{fig-mechanicalturk-aims}
		\includegraphics[width=1\linewidth]{figB.pdf}
\begin{flushleft}
\end{flushleft}
\end{figure}
\vspace{-0.75cm}

For the main analysis in the paper we chose a conservative strategy. We only coded a response as belonging to a category such as "suffering" when at least 3 out of 4 raters agreed that it belonged to that particular category. This is a rather strict cutoff and could mean that we underestimate the prevalence of certain aims in the responses. However, we assume that any such underestimation is relatively constant across aims, hence, it shouldn't affect our conclusions about Hypothesis 1.   
Crowd-coding is both hailed as a useful strategy but also viewed critically [@Snow2008-gq; @Benoit2016-gp; @Lind2017-fo; @Dreyfuss2018-im]. Because Krippendorf's alpha was not higher for certain categories we carried out additonal analyses to see whether our results remain robust to the exclusion of certain workers.
Some workers may take the task less seriously than others which leads to measurement error. Below we excluded the codings of workers that finished the assignments in an average time lower than 0.3 minutes, or longer than 5 minutes as well as coded only 1 response. Extremely low average times may reflect superficial codings. Very long times may indicate that workers worked on several parallel assignments and only finished them once the time ran out. Furthermore, we assume that the quality of coding may improve once workers get used to the coding scheme. The results for this rater subsample are depicted in Table \@ref(tab:tableE) and Table \@ref(tab:tableF). Krippendorf's alpha slightly increase for most of the categories. However, the main findings, namely the comparably low share of responses mentioning suffering as aim of punishment, does not change. In Table \@ref(tab:tableF) the share of responses that are classified as mentioning the aim of suffering is even lower than before the exclusion of certain raters.





```{r analyze-worker-statistics, include=FALSE}
worker.stats <- data.turk %>% group_by(WorkerId) %>% 
                              summarise(n.assignments = n(),
                                                               av.seconds = mean(time.seconds), 
                                                               av.minutes = mean(time.minutes)
                                                               ) %>%
                          mutate(av.seconds = round(av.seconds,0), av.minutes = round(av.minutes,1))
                                                
kable(worker.stats)

# Filtering good workers
  worker.stats <-  worker.stats %>% filter(av.minutes>0.3, av.minutes<5, n.assignments>1)
  
# Subest data.turk keeping only "good" workers
  data.turk <- data.turk %>% filter(WorkerId %in% worker.stats$WorkerId)
   
```



```{r data-krippendorfs-alpha2, echo=FALSE}
# Generate coding variables
  data.turk$rate_suffering <- 0
  data.turk$rate_suffering[str_detect(data.turk$classification, "suffering")] <- 1
  data.turk$rate_deterrence <- 0
  data.turk$rate_deterrence[str_detect(data.turk$classification, "deterrence")] <- 1
  data.turk$rate_reintegration <- 0
  data.turk$rate_reintegration[str_detect(data.turk$classification, "reintegration")] <- 1 
  data.turk$rate_rehabilitation <- 0
  data.turk$rate_rehabilitation[str_detect(data.turk$classification, "rehabilitation")] <- 1 
  data.turk$rate_amends <- 0
  data.turk$rate_amends[str_detect(data.turk$classification, "amends")] <- 1 
  data.turk$rate_vengeance <- 0
  data.turk$rate_vengeance[str_detect(data.turk$classification, "vengeance")] <- 1 
  data.turk$rate_awareness <- 0
  data.turk$rate_awareness[str_detect(data.turk$classification, "awareness")] <- 1 
  data.turk <- as.data.frame(data.turk)

  

  
# GENERATE MATRIX TO CALCULATE KRIPP'S ALPHA
  # ROWS = WORKERS
  # COLUMS = Responses to classify (= RespondentIDs)
  data.turk.irr <- data.turk %>% ungroup() %>% select(WorkerId, Input.participant_id, Input.variable, classification)
  df.null <- data.turk.irr %>% complete(WorkerId, Input.participant_id) %>% arrange(WorkerId) %>%
              select(-Input.variable) 

    df.null <- df.null %>% spread(key = Input.participant_id, value = classification)
  
  
# Generate matrices for Kripppendorf's alpha for all categories
  for(y in c("rate_suffering", "rate_deterrence", "rate_reintegration", 
             "rate_rehabilitation", "rate_amends", "rate_vengeance", "rate_awareness")){
    
      y <- gsub("rate_", "", y)
      df <- df.null
      
      
  for(i in 1:nrow(df)){
    for(z in 2:ncol(df)){
      
      if(!is.na(df[i, z])){
      if(str_detect(df[i, z], y)==TRUE){df[i, z] <- 1}else{df[i, z] <- 0}
      }}
  }
  df <- df %>% select(-WorkerId)
  df <- sapply(df, as.numeric)
  df <- as.matrix(df)
  assign(paste("df_rate_", y, sep=""), df)  
  }
```



```{r tableE, echo=FALSE, results="asis"}
  
# Calculate kripp.alpha
  alpha.suffering <- irr::kripp.alpha(df_rate_suffering)
  alpha.deterrence <- irr::kripp.alpha(df_rate_deterrence)
  alpha.reintegration <- irr::kripp.alpha(df_rate_reintegration)
  alpha.rehabilitation <- irr::kripp.alpha(df_rate_rehabilitation)
  alpha.amends <- irr::kripp.alpha(df_rate_amends)
  alpha.vengeance <- irr::kripp.alpha(df_rate_vengeance)
  alpha.awareness <- irr::kripp.alpha(df_rate_awareness)
  
  irr.table <- data.frame(Category = c("Suffering", "Deterrence", 
                                       "Reintegration", "Rehabilitation", 
                                       "Amends", "Vengeance", "Awareness"),
                          Alpha = c(
                             alpha.suffering$value,
                              alpha.deterrence$value,
                              alpha.reintegration$value,
                              alpha.rehabilitation$value,
                              alpha.amends$value,
                              alpha.vengeance$value,
                              alpha.awareness$value),
                          Responses = c(
                             alpha.suffering$subjects,
                              alpha.deterrence$subjects,
                              alpha.reintegration$subjects,
                              alpha.rehabilitation$subjects,
                              alpha.amends$subjects,
                              alpha.vengeance$subjects,
                              alpha.awareness$subjects),  
                            Raters = c(
                             alpha.suffering$raters,
                              alpha.deterrence$raters,
                              alpha.reintegration$raters,
                              alpha.rehabilitation$raters,
                              alpha.amends$raters,
                              alpha.vengeance$raters,
                              alpha.awareness$raters)) 
  irr.table$Alpha <- round(irr.table$Alpha, 2)
  

  knitr::kable(irr.table, 
               caption = "Interrater-reliability: Krippendorf's alpha", format = "latex", booktabs = T) %>%
  kable_styling(font_size = 9)                         

  
```  

```{r data-preparation-crowd-coding, message=FALSE, warning=FALSE, include=FALSE}

  
  x <- data.turk %>% select(coder,Input.participant_id, Input.variable, classification) %>%
              spread(key = coder, value = classification) %>% 
    mutate(suffering = NA,
           deterrence = NA,
           reintegration = NA,
           rehabilitation = NA,
           amends = NA,
           vengeance = NA,
           awareness = NA) 

  # Create counts of mentionings for each aim
    for(i in 1:nrow(x)){
      # Appearance summed up across coders
      x[i,"suffering"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "suffering"), na.rm = TRUE)
      x[i,"deterrence"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "deterrence"), na.rm = TRUE)
      x[i,"reintegration"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "reintegration"), na.rm = TRUE)
      x[i,"rehabilitation"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "rehabilitation"), na.rm = TRUE)
      x[i,"amends"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "amends"), na.rm = TRUE)
      x[i,"vengeance"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "vengeance"), na.rm = TRUE)
      x[i,"awareness"] <- sum(str_detect(as.character(x[i,c("coder1", "coder2", "coder3", "coder4")]), "awareness"), na.rm = TRUE)
    }

  
  
      # Appearance of aim for each coder
         for(y in c("suffering", "deterrence", "reintegration", "rehabilitation", 
"amends", "vengeance", "awareness")){
           for(z in c("coder1", "coder2", "coder3", "coder4")){
           x[,paste(z, y, sep = "_")] <- NA
           for(i in 1:nrow(x)){
            x[i,paste(z, y, sep = "_")] <-  str_detect(as.character(x[i,c(z)]), y) # replace value of coder z, aim y, row i with TRUE/FALSE depending on whether y appears
           }}}
      


  # Recode counts to 0/1 depending on proportion      
x <- x %>% 
     mutate_at(c("suffering", "deterrence", "reintegration", "rehabilitation", 
"amends", "vengeance", "awareness"), funs(recode(., `0`=0, `1`=0,`2`=0, `3`=1, `4`=1,.default = NaN))) %>% # Cutoff: At least 3 out of 4 have to mention that as an aim in the answer
  mutate_at(c("suffering", "deterrence", "reintegration", "rehabilitation", 
"amends", "vengeance", "awareness"), funs(as.logical))


  
  names(x)[7:13] <- paste("Mention_aim_of_", names(x)[7:13], sep="") # replace names of recoded count variables
  
  x <- x %>% select(Input.participant_id, Input.variable, contains("Mention_aim")) %>% rename(participant_id = Input.participant_id, AimOpen2 = Input.variable)
  

# Delete old mention aim variables
  data <- data %>% select(-contains("Mention_aim"))
  
  
  data <- left_join(data, x, by = "participant_id")
  
```


```{r tableF, echo=FALSE, message=FALSE, warning=FALSE, results='asis'}
  x <- data %>% 
    dplyr::select(contains("Mention_aim")) %>%
      summarise_all(funs(mean(., na.rm = TRUE))) %>% t() %>% round(.,2) %>% as.data.frame()
  x2 <- data %>% 
    dplyr::select(contains("Mention_aim")) %>%
      summarise_all(funs(sum(., na.rm = TRUE))) %>% t() %>% round(.,2) %>% as.data.frame()
  
  x <- cbind(x, x2)

  row.names(x) <- str_replace_all(row.names(x), "_", " ")

 
  x$V1 <- x$V1*100
    names(x) <- c("% responses", "N responses")
  
  knitr::kable(x, 
               caption = 'Share of open-ended answers that mention particular aims', format = "latex", booktabs = T) 
```







Finally, while Table \@ref(tab:tableF) depicts the prevalence of certain aims across all respondents, Table \@ref(tab:tableG) depicts the prevalence of certain aims of punishment split across treatment groups. In other words, since we collected the data to test H1 after our survey experiment we could be worried that the considerations queried through the open-ended question are affected by our survey experiment. Table \@ref(tab:tableG) allows us to explore whether participants's open-ended answers seem to have been influenced by our experimental treatments, i.e., by our experiment. While there are some differences these do not seem to be strong enough to be problematic for a test of Hypothesis 1.

```{r tableG, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE, results='asis'}
  x <- data %>% 
    group_by(treatment_text) %>% 
    dplyr::select(contains("Mention_aim")) %>%
      summarise_all(funs(mean)) %>% 
        mutate_if(is.numeric, funs(round(., 2))) %>% 
          mutate_if(is.numeric, funs(. * 100))
    names(x) <- str_replace_all(names(x), "_aim_of_", " ")
    names(x) <- str_replace_all(names(x), "_", " ")
    names(x) <- paste(names(x), " (%)", " ")
    names(x)[1] <- "Treatment"

  knitr::kable(x, 
               caption = 'Share of open-ended responses that mention particular justifications/aims across treatment groups', format = "latex", booktabs = T) %>%
  kable_styling(font_size = 7, full_width = TRUE) 
```


# Analysis of variance {#sec:anova}
In addition to the comparisons and models estimated in our 'Results' Section we carried out classical ANOVA analyses. Figure \@ref(figC) displays the averages across all treatment groups. Figure \@ref(figD) displays the averages in the treatment groups with samples being split according to values of our two treatment variables --- Suffering and Moral Change --- independently from the respective other variable. The actual data was spread out using jitter.




```{r data-preparation-anova, include=FALSE}
data$MoralChange_fac <- ordered(data$MoralChange,
                                levels = c("no", "yes"))
data$Suffering_fac <- ordered(data$Suffering,
                              levels = c("unhappy", "neutral", "happy"))

data$treatment_text_fac <- factor(data$treatment_text) 
```
 
```{r figC, fig.align='center', fig.cap="\\label{fig:groupdistribution-all}Means and distributions across all treatment groups", fig.show='hold', message=FALSE, warning=FALSE, include=FALSE, out.width='.7\\linewidth', paged.print=FALSE}


    labels <- str_replace_all(levels(data$treatment_text_fac), "^happy_", "Happy ")
    labels <- str_replace_all(labels, "^unhappy_", "Unhappy ")
    labels <- str_replace_all(labels, "neutral_", "Neutral ")
    labels <- str_replace_all(labels, "nomoralch", "(No Moral Change)")
    labels <- str_replace_all(labels, "yesmoralch", "(Yes Moral Change)")  
    levels(data$treatment_text_fac) <- labels
    levels(data$treatment_text_fac) <- c("Happy\n (No Moral Change)", "Happy\n  (Yes Moral Change)", "Neutral\n  (No Moral Change)", 
"Neutral\n  (Yes Moral Change)", "Unhappy\n  (No Moral Change)", "Unhappy\n  (Yes Moral Change)"
)


ggline(data, x = "treatment_text_fac", y = "justice_perception", 
       add = c("mean_se", "jitter"), 
       order =levels(data$treatment_text_fac),
       ylab = "Perceived justice", xlab = "Group comparison",
       color = "gray", point.color = "black") +
    theme(axis.text.x = element_text(size=7)) 

ggsave("figC.pdf")
```


\begin{figure}[!ht]
\centering
\caption{Means and distributions across all treatment groups}\label{figC}
		\includegraphics[width=.9\linewidth]{figC.pdf}
\begin{flushleft}
\end{flushleft}
\end{figure}

```{r figD, fig.align='center', fig.cap="\\label{fig:groupdistribution}Means and distributions for sample split according to the two treatment variables", fig.height=3, fig.show='hold', fig.width=3, message=FALSE, warning=FALSE, include=FALSE, out.width='.49\\linewidth', paged.print=FALSE}


library("ggpubr")

p1 <- ggline(data, x = "MoralChange_fac", y = "justice_perception", 
       add = c("mean_se", "jitter"), 
       order = c("no", "yes"),
       ylab = "Perceived justice", xlab = "Group comparison:\n Moral Change",
       color = "gray", point.color = "black")

p2 <- ggline(data, x = "Suffering_fac", y = "justice_perception", 
       add = c("mean_se", "jitter"), 
       order = c("unhappy", "neutral", "happy"),
       ylab = "Perceived justice", xlab = "Group comparison:\n Suffering",
       color = "gray", point.color = "black")


#pdf("figD.pdf", width = 7, height = 3)
#g <- grid.arrange(p1, p2, ncol=2)
#dev.off()


g <- grid.arrange(p1, p2, ncol=2)
walk(c('pdf'), ~ ggsave(filename = file.path(paste0("figD.", .x)), device = .x, plot = g, width = 7, height = 3))


```

\begin{figure}[!ht]
\centering
\caption{Means and distributions for sample split according to the two treatment variables}\label{figD}
		\includegraphics[width=.9\linewidth]{figD.pdf}
\begin{flushleft}
\end{flushleft}
\end{figure}


```{r anova, include=FALSE}
# Compute the analysis of variance
fit.moralchange <- aov(justice_perception ~ MoralChange_fac, data = data)
# Summary of the analysis
summary(fit.moralchange)


# Compute the analysis of variance
fit.suffering <- aov(justice_perception ~ Suffering_fac, data = data)
# Summary of the analysis
summary(fit.suffering)


# Compute the analysis of variance
fit.treatment_text <- aov(justice_perception ~ treatment_text_fac, data = data)
# Summary of the analysis
summary(fit.treatment_text)

```

```{r anova-tukey, include=FALSE}
options(scipen=999)
TukeyHSD(fit.moralchange)
diff.moralchange <- round(TukeyHSD(fit.moralchange)$`MoralChange_fac`[1],2)
pvalue.moralchange <- round(TukeyHSD(fit.moralchange)$`MoralChange_fac`[4],2)
TukeyHSD(fit.suffering)
diff.suffering <- round(TukeyHSD(fit.suffering)$`Suffering_fac`[2,1],2)
pvalue.suffering <- round(TukeyHSD(fit.suffering)$`Suffering_fac`[2,4],2)
TukeyHSD(fit.treatment_text)
```

One-way ANOVA tests yield significant p-values for groups means for both the Suffering treatment (P-value = 0.033) and Moral Change treatment (P-value = 2.15e-09) indicating that some of the group means are different. While there are only two subsamples (groups or values) for Moral Change, we don't know which combinations of the three Suffering subsamples (groups or values) display statistically significant differences. One-way ANOVA tests splitting the sample into groups corresponding to the 6 treatment groups yield the same result.                       
In a next step we perform multiple pairwise-comparison computing Tukey Honest Significant Differences [@Yandell2017-ww], to determine if the mean difference between specific pairs of groups are statistically significant. We find that there is a highly statistically significant difference comparing the Moral Change treatments ("no" vs. "yes"). The difference lies at `r diff.moralchange` (P-value = `r sprintf('%.2f',pvalue.moralchange)`). For Suffering there is a significant difference of `r diff.suffering` when we compare the "unhappy" to the "happy" category (P-value = `r sprintf('%.2f',pvalue.suffering)`), i.e., the two extreme categories on this three-point scale. The differences between neutral-unhappy and happy-neutral are not statistically significant. ANOVA tests assume normally distributed data and homogeneous variance across groups. We checked the homogeneity of variance assumption relying on Levene's test [@Fox2016-it]. The test indicates a violation for groups of Moral Change but not for groups of Suffering. For this reason we compute a non-parametric alternative to the one-way ANOVA test, namely the Kruskal-Wallis rank sum test [@Hollander1973-sp, 115-120]. The results from the rank sum test indicate that there are significant differences between our treatment groups for our two treatment variables Moral Change and Suffering. These results reflect the findings from our main analysis. For this reason we refer the reader back to the 'Results' Section in the main paper.    

```{r anova-levene, eval=FALSE, include=FALSE}
library(car)
leveneTest(justice_perception ~ MoralChange_fac, data = data)
leveneTest(justice_perception ~ Suffering_fac, data = data)
leveneTest(justice_perception ~ treatment_text_fac, data = data)
```


```{r anova-kruskal, eval=FALSE, include=FALSE}
kruskal.test(justice_perception ~ MoralChange_fac, data = data)
kruskal.test(justice_perception ~ Suffering_fac, data = data)
kruskal.test(justice_perception ~ treatment_text_fac, data = data)
```


\clearpage


# Contrasting open-ended, ranking and classic retributivism scale

Table \@ref(tab:tableH) displays the open-ended responses after they have been classified according to whether they mentioned particular aims of punishment. However, as opposed to Table 1 in the main paper we now show the marginal distributions for different values on the retributivism scale that has 11 values. Specifically, we show those distributions for respondents with low values on the scale (0-3) and for respondents with high values on the scale (7-10). As was to be expected the share of respondents that mention suffering as an aim in their open-ended response is higher among those that also picked high values on the retributivsm scale. Nontheless, those shares are lower than one would expect. To some extent this is certainly related to the way we coded those open-ended responses. However, even if those values would vary because of a different coding scheme, the numbers would still be in the lower range. Further below we contrast the explicit retributivism scale with the ranking question on aims of punishment. 


```{r tableH, echo=FALSE, message=FALSE, warning=FALSE, results='asis'}

table_null <- structure(list(empty = c(NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c("Mention_aim_of_suffering", 
"Mention_aim_of_deterrence", "Mention_aim_of_reintegration", 
"Mention_aim_of_rehabilitation", "Mention_aim_of_amends", "Mention_aim_of_vengeance", 
"Mention_aim_of_awareness"))
for(i in c(0:3, 7:10)){ # 0:10
    x <- data %>% filter(retributivist_scale==i) %>%
    dplyr::select(contains("Mention_aim")) %>%
      summarise_all(funs(mean(., na.rm = TRUE))) %>% t() %>% round(.,2) %>% as.data.frame()
  x2 <- data %>% filter(retributivist_scale==i) %>%
    dplyr::select(contains("Mention_aim")) %>%
      summarise_all(funs(sum(., na.rm = TRUE))) %>% t() %>% round(.,2) %>% as.data.frame()
  x3 <- bind_cols(x, x2)
  x3 <- x3 %>% mutate(combine = paste(V1, " (", V11, ")", sep="")) %>% select(combine)
  names(x3) <- i
 
  table_null <- bind_cols(table_null, x3)
  
}

table_null <- table_null %>% select(-empty)
  
row.names(table_null) <- c("Mention_aim_of_suffering", 
"Mention_aim_of_deterrence", "Mention_aim_of_reintegration", 
"Mention_aim_of_rehabilitation", "Mention_aim_of_amends", "Mention_aim_of_vengeance", 
"Mention_aim_of_awareness")


  row.names(table_null) <- str_replace_all(row.names(table_null), "_", " ")

 
  # table_null$V1 <- table_null$V1*100
    # names(table_null) <- c("% responses", "N responses")
  
  knitr::kable(table_null, 
               caption = 'Share of open-ended answers that mention particular aims for particular reponses on the closed retributivist scale', format = "latex", booktabs = T) %>%
  kable_styling(font_size = 7, full_width = TRUE, latex_options = "hold_position") 
```

Figure \@ref(figE) visualizes the results of the ranking question that provides respondents with a pre-defined choice set of aims of punishment. However, now we visualize those rankings for subsets of participants that picked particular values on the retributivism scale, either low values (0-3) or high values (7-10). Again we can observe that respondents that pick high values on the retributivism scale more often rank the aim of desert first. However, by far not everyone does. For instance, across both low and high values of the classic retributivism scale a large share of people rank the aim of deterrence in the first place. In other words, when contrasted with the classic retributivism scale both our open-ended measure and our ranking measure reveal that while there is overlap, there is also considerable variation behind the same value on this scale.


```{r figE, fig.cap="Rankings of aims for different values on the redistributive scale", fig.height=9, message=FALSE, warning=FALSE, include=FALSE, out.extra=''}


# GGPLOT2

for(i in c(0:3, 7:10)){


  data <- data.frame(data)

  plot.data <- data %>% 
    filter(retributivist_scale==i) %>% 
    dplyr::select(contains("AimRank"), -AimRankTime)
  names(plot.data) <- str_replace(names(plot.data), "Aim", "")
  x <- plot.data %>% gather() %>% group_by(key, value) %>% count(value) %>% rename(xaxis = key, Aims = value) %>% ungroup() %>% mutate(Aims = str_to_title(Aims))
  x$xaxis <- gsub("k\\.", "k ",x$xaxis)
    x$xaxis <- gsub("\\.", "",x$xaxis)
    nobservations <- sum(x$n[x$xaxis=="Rank 1"])
    
     x$Aims <- recode(x$Aims, "Suffering" = "Desert")
    
# Stacked + percent
p <- ggplot(x, aes(fill=Aims, y=n, x=xaxis)) + 
    geom_bar(position="fill", stat="identity") +
  theme_light() +
  theme(legend.position="bottom",
        plot.title = element_text(size = 8))+
  ylab("") +
  xlab("") +
    ggtitle(paste("Redistributivism scale value: ", i, " (",nobservations," respondents)",sep=""))+ 
          scale_y_continuous(labels=scales::percent) + guides(col = guide_legend(ncol = 5))

assign(paste("p", i, sep=""), p)

}


# grid.arrange(p0, p1, p2, p3, p7, p8, p9, p10, ncol=2)




mylegend<-g_legend(p0)


# pdf("figE.pdf", width = 7, height = 9)
# grid.arrange(mylegend, arrangeGrob(p0 + theme(legend.position="none"),
#                                    p1 + theme(legend.position="none"),
#                                    p2 + theme(legend.position="none"),
#                                    p3 + theme(legend.position="none"),
#                                    p7 + theme(legend.position="none"),
#                                    p8 + theme(legend.position="none"),
#                                    p9 + theme(legend.position="none"),
#                                    p10 + theme(legend.position="none"),
#                                    nrow=4),
#                    left = grid::textGrob("Share of aims mentioned for each rank", rot = 90, vjust = 1),
#                    bottom = grid::textGrob("Rankings", rot = 0, vjust = 0.5),
#                    nrow=2, heights=c(1, 10))
# dev.off()




g <- grid.arrange(mylegend, arrangeGrob(p0 + theme(legend.position="none"),
                                   p1 + theme(legend.position="none"),
                                   p2 + theme(legend.position="none"),
                                   p3 + theme(legend.position="none"),
                                   p7 + theme(legend.position="none"),
                                   p8 + theme(legend.position="none"),
                                   p9 + theme(legend.position="none"),
                                   p10 + theme(legend.position="none"),
                                   nrow=4),
                   left = grid::textGrob("Share of aims mentioned for each rank", rot = 90, vjust = 1),
                   bottom = grid::textGrob("Rankings", rot = 0, vjust = 0.5),
                   nrow=2, heights=c(1, 10))
walk(c('pdf'), ~ ggsave(filename = file.path(paste0("figE.", .x)), device = .x, plot = g, width = 7, height = 9))



```  


\begin{figure}[!ht]
\centering
\caption{Rankings of aims for different values on the redistributive scale}\label{figE}
		\includegraphics[width=1\linewidth]{figE.pdf}
\begin{flushleft}
\end{flushleft}
\end{figure}



\clearpage

# R session info 

```{r echo=FALSE}
print(sessionInfo(), local = FALSE)
```

\clearpage

# References

